In our hidden markov model sequence analysis of human immune genes of interest across microbial metagenomes, we find certain genes with high sequence conservation in both eukaryotes and some prokaryotes. Some of these genes are involve in basic cellular functions, such as translation, while others are involved in more specific functions, such as immune response. To determine which genes we should anticipate to be conserved across the tree of life, we will use the RefSeq database to sample representative genomes from each genus. With this sequence set, we will create a custom BLAST database and queary each gene of interest against it. We will then use the BLAST results to determine the degree of conservation of each gene across the tree of life.
Objectives
Quantify the degree of conservation of human immune genes across the tree of life
url_list <-read.delim(glue("{wkdir}/data/interim/refseq_genomes/genus_sampling/urls.txt"),header =FALSE, sep ="\t")download_dir <-"/central/scratch/jbok/refseq_proteins/genus_sampling/download"faa_dir <-glue("{download_dir}/protein_faa")fna_dir <-glue("{download_dir}/genomic_fna")shell_do(glue("mkdir -p {faa_dir}"))shell_do(glue("mkdir -p {fna_dir}"))# We should have 30,957 genomesurl_list %<>%mutate(file_header = strex::str_after_last(V1, "/"),rsync_dl_link_faa =glue("{gsub('https', 'rsync', V1)}/","{file_header}_protein.faa.gz" ),rsync_dl_link_fna =glue("{gsub('https', 'rsync', V1)}/","{file_header}_genomic.fna.gz" ),dl_path_faa =glue("{faa_dir}/{file_header}_protein.faa.gz" ),dl_path_fna =glue("{fna_dir}/{file_header}_genomic.fna.gz" ),dl_path_faa_exists =map_lgl(dl_path_faa, file.exists),dl_path_fna_exists =map_lgl(dl_path_fna, file.exists) )# Once this is complete, continue bash run above. Note, it will also dramatically speed up the process if you now comment out the download_genomes() modules
# generate list of protein files from ncbi that downloaded successfully ref_prot_df <-tibble("refseq_prot_path"=list.files(path = refseq_proteins_dir, full.names =TRUE)) %>%mutate(ftp_name =str_after_last(refseq_prot_path, "/") %>%gsub("_protein.faa.gz", "", .) )# create a data frame for batch processingbatch_input_df <-tibble("refseq_fna_path"=list.files(path =glue("{scratch_dir}/refseq_proteins/genus_sampling/download/genomic_fna"),full.names =TRUE ) ) %>%mutate(perfect_refseq_fna_path =case_when(grepl("\\.gz$", refseq_fna_path) ~ refseq_fna_path,TRUE~glue("{refseq_fna_path}.gz") ),ftp_name =str_after_last(perfect_refseq_fna_path, "/") %>%gsub("_genomic.fna.gz", "", .),# generate list of desired fasta outputsprodigal_path = purrr::map_chr( perfect_refseq_fna_path,~glue("{scratch_dir}/prodigal_proteins/","{gsub('genomic.fna.gz', 'protein.fasta', basename(.))}" ) ),prodigal_unprocessed_lgr = purrr::map_lgl(prodigal_path, ~!file.exists(.)),prodigal_filesize = purrr::map_dbl(prodigal_path, ~file.size(.)),transeq_path =glue("{scratch_dir}/transeq_proteins/","{ftp_name}_protein.fasta" ),transeq_path_exists = purrr::map_lgl(transeq_path, file.exists),transeq_filesize = purrr::map_dbl(transeq_path, ~file.size(.)) ) %>%full_join(ref_prot_df) %>%full_join(merged_meta)saveRDS( batch_input_df,glue("{genus_dir}/{Sys.Date()}_fna_and_prodigal_paths.rds"))# RENAME PROBLEMATIC FILE NAMES (run once and then rerun chunk)ref_prot_df %>%filter(grepl(".fasta.gz_", refseq_prot_path) ) %>%pull(refseq_prot_path) %>% purrr::map(~shell_do(glue("mv {.} {gsub('.fasta.gz_protein', '_protein', .)}") ))batch_input_df %>%filter(grepl(".fa.gz_genomic", refseq_fna_path) ) %>%pull(refseq_fna_path) %>% purrr::map( ~shell_do(glue("mv {.} {gsub('.fa.gz_genomic', '_genomic', .)}")))batch_input_df %>%filter(grepl(".fasta.gz_genomic", refseq_fna_path) ) %>%pull(refseq_fna_path) %>% purrr::map( ~shell_do(glue("mv {.} {gsub('.fasta.gz_genomic', '_genomic', .)}")))
Code
# ORDER OF PRIORITY ----# 1) protein_aa info available from ncbi# 2) prodigal extraction of CDS from genome fna (only for smaller genomes)# 3) transeq brute force translation from genome fna (only for larger genomes)# #' create one common output path for all files# final_set <- batch_input_df %>%# mutate(# protein_path = case_when(# !is.na(refseq_prot_path) ~ refseq_prot_path,# !is.na(prodigal_filesize) & prodigal_filesize > 0 ~ prodigal_path# # transeq_path_exists ~ transeq_path# ),# clean_protein_path =# glue("{scratch_dir}/cleaned_fasta/{ftp_name}_cleaned-protein.fasta")# ) %>%# drop_na(protein_path)# final_set %>% glimpserefseq_protein_paths <-tibble("refseq_protein_paths"=list.files(path =glue("{scratch_dir}/refseq_proteins/genus_sampling/download/protein_faa" ),full.names =TRUE )) %>%mutate(ftp_name =gsub("_protein.faa", "", basename(refseq_protein_paths)))prodigal_paths <-tibble("prodigal_paths"=list.files(path =glue("{scratch_dir}/prodigal_proteins" ),full.names =TRUE )) %>%mutate(ftp_name =gsub("_protein.fasta", "", basename(prodigal_paths)))final_set <- refseq_protein_paths %>%full_join(prodigal_paths) %>%left_join(merged_meta, by ="ftp_name") %>%mutate(protein_path =case_when(!is.na(refseq_protein_paths) ~ refseq_protein_paths,TRUE~ prodigal_paths ),clean_protein_path =glue("{scratch_dir}/cleaned_fasta/{ftp_name}_cleaned-protein.fasta") ) %>%drop_na(genome) %>%drop_na(protein_path)final_set %>% glimpsefinal_set$protein_path %>% length %>% uniquefinal_set$genome %>% length %>% uniquefinal_set$protein_path %>%keep(grepl("refseq_proteins", .)) %>% lengthfinal_set$protein_path %>%keep(grepl("prodigal", .)) %>% lengthsaveRDS(final_set, glue("{wkdir}/data/interim/refseq_genomes/","genus_sampling/{Sys.Date()}_translated-proteins.rds"))
For each file loop through files: 1) copy genome ID at the start of the file, and replace spaces with underscore 2) concatenate files 3) replace commas with underscore in fasta headers
# BLAST 2.12.0+# creating a blast database from phylogenetic catalogcreate_db_cmd <-glue("makeblastdb"," -in {scratch_dir}/2023-06-04_RefSeq-genus.fasta"," -dbtype prot"," -out {genus_dir}/blastdb/2023-06-04_RefSeq-genus")slurm_shell_do(cmd = create_db_cmd,memory ="200G",ncpu =1,walltime =86400)# Building a new DB, current time: 06/04/2023 11:47:35# New DB name: /central/groups/MazmanianLab/joeB/Microbiota-Immunomodulation/Microbiota-Immunomodulation/data/interim/refseq_genomes/genus_sampling/blastdb/2023-06-04_RefSeq-genus# New DB title: /central/scratch/jbok/2023-06-04_RefSeq-genus.fasta# Sequence type: Protein# Keep MBits: T# Maximum file size: 1000000000B# Adding sequences from FASTA; added 156526985 sequences in 3148.36 seconds.